This notebook implements the first fully functional version of the pipeline.

The pipeline includes three steps:

  1. Encoding
  2. Decoding
  3. Calculating error

GOAL

  1. Calculate the number of distinct templates, the coverage, and the reconstruction error of the bottom-up approach at various thresholds
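
In nbminer terms, the three steps correspond directly to the calls used throughout this notebook. A condensed sketch (the same API calls as below, just gathered into one function):

In [ ]:
def run_pipeline(features, threshold, split_call):
    # features: an ASTFeatures object built from the mined notebooks
    agr = ASTGraphReducer(features, threshold=threshold, split_call=split_call)
    agr.set_graphs()
    agr.encode()                 # 1. encode cell ASTs into templates
    agr.decode()                 # 2. reconstruct source from the templates
    return AstorError(features)  # 3. measure reconstruction error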

In [29]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *  # wildcard import; supplies ASTGraphReducer used below

In [30]:
# Collect every .ipynb file from each person's directory in the testbed
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]
a = ASTFeatures(notebook_objs)

In [31]:
# Replace each notebook with the preprocessed copy returned by get_new_notebook()
for i, nb in enumerate(a.nb_features):
    a.nb_features[i] = nb.get_new_notebook()

In [32]:
# Build the graph reducer at a fixed threshold and report basic statistics
agr = ASTGraphReducer(a, threshold=5, split_call=True)
agr.set_graphs()

print('Total number of graphs:', agr.number_graphs())
print('Total number of graphs with one node:', agr.number_single())
print('Total number of nodes:', agr.count_nodes())


Total number of graphs: 19882
Total number of graphs with one node: 0
Total number of nodes: 289657

In [33]:
print(agr.count_nodes())  # node count before encoding
agr.encode()              # encode cell ASTs into templates


289657

In [34]:
agr.decode()  # reconstruct source code from the templates

In [36]:
print(agr.get_samples_str())


Original:
p.circle(eth_year, eth_favorites, color='red', size=8)

Reconstructed:
p.circle(eth_day, eth_retweets, color='red', size=8)

****************************************************************************************************
Original:
eth_tweets = eth_clean[['text']].dropna(axis=0)

Reconstructed:

****************************************************************************************************
Original:
epfl_volume_df['Hour'] = epfl_volume_df.apply(lambda x: x['created_at'].
    hour, axis=1)

Reconstructed:
epfl_en_sample_ML['Hour'] = epfl_en_sample_ML.apply(lambda x: x[
    'created_at'].hour, axis=1)

****************************************************************************************************
Original:
df_g_hour = df_g[['hour_tweet', 'favorite_count', 'retweet_count']].groupby([
    'hour_tweet']).sum()

Reconstructed:
df_g_month = df_g[['month_num_tweet', 'favorite_count', 'retweet_count']
    ].groupby(['month_num_tweet']).sum()

****************************************************************************************************
Original:
from nltk.corpus import opinion_lexicon

Reconstructed:
from sklearn.model_selection import train_test_split

****************************************************************************************************
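
The mismatches above appear to come from name abstraction: when a template is formed, concrete identifiers are collapsed, so decoding can only substitute a representative name back in. A minimal illustration of that kind of abstraction (not nbminer's actual implementation), using the standard ast module plus astor:

In [ ]:
# Illustrative sketch only -- NOT how ASTGraphReducer builds templates.
import ast
import astor

class NameAbstracter(ast.NodeTransformer):
    """Collapse every variable name into a single placeholder."""
    def visit_Name(self, node):
        return ast.copy_location(ast.Name(id='VAR', ctx=node.ctx), node)

tree = ast.parse("p.circle(eth_year, eth_favorites, color='red', size=8)")
print(astor.to_source(NameAbstracter().visit(tree)).strip())
# -> VAR.circle(VAR, VAR, color='red', size=8)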


In [8]:
from nbminer.reconstruction_error.astor_error import AstorError

In [9]:
ae = AstorError(a)
print(ae.get_summary())


The average length of the original strings is: 51.694545454545455
The average length of the reconstructed strings is: 17.176363636363636
The average edit distance is: 40.658181818181816
The average number of characters in common is: 13.31845583497618
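
For reference, the "edit distance" reported above is plausibly the standard Levenshtein distance between the original and reconstructed strings; a self-contained sketch of that metric (AstorError's internal definition may differ):

In [ ]:
# Plain Levenshtein distance, shown only to make the metric concrete.
def edit_distance(a, b):
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

orig = "p.circle(eth_year, eth_favorites, color='red', size=8)"
recon = "p.circle(eth_day, eth_retweets, color='red', size=8)"
print(edit_distance(orig, recon))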


In [10]:
coverage = []
number_templates = []
avg_dist = []
avg_sim = []
# Sweep template-creation thresholds with split_call=True (function calls
# distinguished by name); the threshold=10000 run serves as the bound in
# the plots further below.
for value in [2, 5, 10, 20, 30, 50, 10000]:
    print('Calculating for value: ', value)
    # Reload the notebooks so each run starts from fresh, unencoded ASTs
    people = os.listdir('../testbed/Final')
    notebooks = []
    for person in people:
        person = os.path.join('../testbed/Final', person)
        if os.path.isdir(person):
            direc = os.listdir(person)
            notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
    notebook_objs = [NotebookMiner(file) for file in notebooks]
    a = ASTFeatures(notebook_objs)
    for i, nb in enumerate(a.nb_features):
        a.nb_features[i] = nb.get_new_notebook()
    agr = ASTGraphReducer(a, threshold=value, split_call=True)
    agr.set_graphs()
    agr.encode()
    agr.decode()
    ae = AstorError(a)
    avg_dist.append(ae.average_distance())
    avg_sim.append(ae.average_similarity())
    coverage.append(ae.get_percent_coverage())
    number_templates.append(ae.get_unique_templates())


Calculating for value:  2
Calculating for value:  5
Calculating for value:  10
Calculating for value:  20
Calculating for value:  30
Calculating for value:  50
Calculating for value:  10000

In [11]:
coverage_general = []
number_templates_general = []
avg_dist_general = []
avg_sim_general = []
for value in [2,5,10,20,30,50, 10000]:
    print ('Calculating for value: ',value)
    people = os.listdir('../testbed/Final')
    notebooks = []
    for person in people:
        person = os.path.join('../testbed/Final', person)
        if os.path.isdir(person):
            direc = os.listdir(person)
            notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
    notebook_objs = [NotebookMiner(file) for file in notebooks]
    a = ASTFeatures(notebook_objs)
    for i, nb in enumerate(a.nb_features):
        a.nb_features[i] = nb.get_new_notebook()
    agr = ASTGraphReducer(a, threshold=value, split_call=False)
    agr.set_graphs()
    num_nodes_orig = []
    agr.encode()
    agr.decode()
    ae = AstorError(a)
    avg_dist_general.append(ae.average_distance())
    avg_sim_general.append(ae.average_similarity())
    coverage_general.append(ae.get_percent_coverage())
    number_templates_general.append(ae.get_unique_templates())


Calculating for value:  2
Calculating for value:  5
Calculating for value:  10
Calculating for value:  20
Calculating for value:  30
Calculating for value:  50
Calculating for value:  10000
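
The two sweeps above differ only in the split_call flag. A possible consolidation, using only the calls already present above (shown as a sketch, not re-run here):

In [ ]:
def load_features(root='../testbed/Final'):
    # Rebuild ASTFeatures from scratch: encode()/decode() mutate the
    # features, so each threshold needs a fresh copy.
    notebooks = []
    for person in os.listdir(root):
        person = os.path.join(root, person)
        if os.path.isdir(person):
            notebooks.extend(os.path.join(person, f)
                             for f in os.listdir(person) if f.endswith('.ipynb'))
    features = ASTFeatures([NotebookMiner(f) for f in notebooks])
    for i, nb in enumerate(features.nb_features):
        features.nb_features[i] = nb.get_new_notebook()
    return features

def sweep(thresholds, split_call):
    dist, sim, cov, templates = [], [], [], []
    for value in thresholds:
        print('Calculating for value: ', value)
        features = load_features()
        agr = ASTGraphReducer(features, threshold=value, split_call=split_call)
        agr.set_graphs()
        agr.encode()
        agr.decode()
        ae = AstorError(features)
        dist.append(ae.average_distance())
        sim.append(ae.average_similarity())
        cov.append(ae.get_percent_coverage())
        templates.append(ae.get_unique_templates())
    return dist, sim, cov, templates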

In [39]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20, 20)
# The 'Bound' lines show the metrics from the threshold=10000 run (index 6)
x = [2, 5, 10, 20, 30, 50]

fig, axes = plt.subplots(2, 2)
n1, = axes[0, 0].plot(x, avg_dist[:6], label='Average Distance (Split function calls)')
n2, = axes[0, 0].plot(x, avg_dist_general[:6], label='Average Distance (All calls are the same)')
n3, = axes[0, 0].plot(x, [avg_dist_general[6]] * 6, label='Bound')
axes[0, 0].set_title('Average edit distance')
axes[0, 0].set_xlabel('Threshold for Template Creation')
axes[0, 0].set_ylabel('Average edit distance')
axes[0, 0].legend(handles=[n1, n2, n3])
n1, = axes[0, 1].plot(x, avg_sim[:6], label='Average Similarity (Split function calls)')
n2, = axes[0, 1].plot(x, avg_sim_general[:6], label='Average Similarity (All calls are the same)')
n3, = axes[0, 1].plot(x, [avg_sim_general[6]] * 6, label='Bound')
axes[0, 1].set_title('Average matching characters')
axes[0, 1].set_xlabel('Threshold for Template Creation')
axes[0, 1].set_ylabel('Average matching characters')
axes[0, 1].legend(handles=[n1, n2, n3])
n1, = axes[1, 0].plot(x, coverage[:6], label='Coverage (Split function calls)')
n2, = axes[1, 0].plot(x, coverage_general[:6], label='Coverage (All calls are the same)')
n3, = axes[1, 0].plot(x, [coverage_general[6]] * 6, label='Bound')
axes[1, 0].set_title('Coverage of templates')
axes[1, 0].set_xlabel('Threshold for Template Creation')
axes[1, 0].set_ylabel('Coverage of templates')
axes[1, 0].legend(handles=[n1, n2, n3])
n1, = axes[1, 1].plot(x, number_templates[:6], label='Number of Templates (Split function calls)')
n2, = axes[1, 1].plot(x, number_templates_general[:6], label='Number of Templates (All calls are the same)')
n3, = axes[1, 1].plot(x, [number_templates_general[6]] * 6, label='Bound')
axes[1, 1].set_title('Number of templates')
axes[1, 1].set_xlabel('Threshold for Template Creation')
axes[1, 1].set_ylabel('Number of templates')
axes[1, 1].legend(handles=[n1, n2, n3])


Out[39]:
<matplotlib.legend.Legend at 0x117999e80>
